{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use OffPAC to Play Acrobot-v1\n",
    "\n",
    "PyTorch version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import sys\n",
    "import logging\n",
    "import importlib  # replaces deprecated 'imp' (removed in Python 3.12)\n",
    "import itertools\n",
    "\n",
    "import numpy as np\n",
    "np.random.seed(0)  # seed NumPy for the uniform behavior policy\n",
    "import pandas as pd\n",
    "import gym\n",
    "import matplotlib.pyplot as plt\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "import torch.distributions as distributions\n",
    "torch.manual_seed(0)  # seed PyTorch for network init and sampling\n",
    "\n",
    "# Reload so basicConfig takes effect even if logging was already configured\n",
    "importlib.reload(logging)\n",
    "logging.basicConfig(level=logging.DEBUG,\n",
    "        format='%(asctime)s [%(levelname)s] %(message)s',\n",
    "        stream=sys.stdout, datefmt='%H:%M:%S')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20:35:40 [INFO] env: <AcrobotEnv<Acrobot-v1>>\n",
      "20:35:40 [INFO] action_space: Discrete(3)\n",
      "20:35:40 [INFO] observation_space: Box(-28.274333953857422, 28.274333953857422, (6,), float32)\n",
      "20:35:40 [INFO] reward_range: (-inf, inf)\n",
      "20:35:40 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 15}\n",
      "20:35:40 [INFO] _max_episode_steps: 500\n",
      "20:35:40 [INFO] _elapsed_steps: None\n"
     ]
    }
   ],
   "source": [
    "env = gym.make('Acrobot-v1')\n",
    "env.seed(0)  # old gym seeding API (pre-0.21) -- confirm against installed gym version\n",
    "# Log every attribute of the (wrapped) env as a record of the experiment setup\n",
    "for key in vars(env):\n",
    "    logging.info('%s: %s', key, vars(env)[key])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "class OffPACAgent:\n",
    "    \"\"\"Off-Policy Actor-Critic (OffPAC) agent for discrete-action envs.\n",
    "\n",
    "    Training collects experience with a uniform-random behavior policy and\n",
    "    updates the actor/critic with importance-sampling-weighted gradients;\n",
    "    evaluation samples actions from the learned actor.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, env):\n",
    "        self.action_n = env.action_space.n\n",
    "        self.gamma = 0.99  # discount factor\n",
    "\n",
    "        # Actor: state -> action probabilities (softmax over dim 1)\n",
    "        self.actor_net = self.build_net(\n",
    "                input_size=env.observation_space.shape[0],\n",
    "                hidden_sizes=[100,],\n",
    "                output_size=env.action_space.n, output_activator=nn.Softmax(1))\n",
    "        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 0.0002)\n",
    "        # Critic: state -> q(s, a) for every action\n",
    "        self.critic_net = self.build_net(\n",
    "                input_size=env.observation_space.shape[0],\n",
    "                hidden_sizes=[100,], output_size=self.action_n)\n",
    "        self.critic_optimizer = optim.Adam(self.critic_net.parameters(), 0.0004)\n",
    "        self.critic_loss = nn.MSELoss()\n",
    "\n",
    "    def build_net(self, input_size, hidden_sizes, output_size,\n",
    "            output_activator=None):\n",
    "        \"\"\"Build an MLP with ReLU hidden activations.\n",
    "\n",
    "        Args:\n",
    "            input_size: number of input features.\n",
    "            hidden_sizes: list of hidden-layer widths.\n",
    "            output_size: number of output units.\n",
    "            output_activator: optional module appended after the last layer.\n",
    "\n",
    "        Returns:\n",
    "            nn.Sequential implementing the MLP.\n",
    "        \"\"\"\n",
    "        layers = []\n",
    "        # Use distinct loop names; the original shadowed input_size/output_size\n",
    "        for in_size, out_size in zip(\n",
    "                [input_size,] + hidden_sizes, hidden_sizes + [output_size,]):\n",
    "            layers.append(nn.Linear(in_size, out_size))\n",
    "            layers.append(nn.ReLU())\n",
    "        layers = layers[:-1]  # drop the ReLU after the output layer\n",
    "        if output_activator:\n",
    "            layers.append(output_activator)\n",
    "        net = nn.Sequential(*layers)\n",
    "        return net\n",
    "\n",
    "    def reset(self, mode=None):\n",
    "        \"\"\"Begin an episode; in 'train' mode reset trajectory and discount.\"\"\"\n",
    "        self.mode = mode\n",
    "        if self.mode == 'train':\n",
    "            self.trajectory = []\n",
    "            self.discount = 1.  # accumulates gamma**t over the episode\n",
    "\n",
    "    def step(self, observation, reward, done):\n",
    "        \"\"\"Select an action and, when training, run one learning update.\"\"\"\n",
    "        if self.mode == 'train':\n",
    "            # Behavior policy: uniform random over all actions\n",
    "            action = np.random.choice(self.action_n)\n",
    "            self.trajectory += [observation, reward, done, action]\n",
    "            if len(self.trajectory) >= 8:  # need two consecutive transitions\n",
    "                self.learn()\n",
    "            self.discount *= self.gamma\n",
    "        else:\n",
    "            # Target policy: sample from the actor's action distribution\n",
    "            state_tensor = torch.as_tensor(observation, dtype=torch.float).unsqueeze(0)\n",
    "            prob_tensor = self.actor_net(state_tensor)\n",
    "            action_tensor = distributions.Categorical(prob_tensor).sample()\n",
    "            action = action_tensor.numpy()[0]\n",
    "        return action\n",
    "\n",
    "    def close(self):\n",
    "        pass\n",
    "\n",
    "    def learn(self):\n",
    "        \"\"\"One OffPAC update from the latest (s, a, r, s', a') tuple.\"\"\"\n",
    "        state, _, _, action, next_state, reward, done, next_action = \\\n",
    "                self.trajectory[-8:]\n",
    "        state_tensor = torch.as_tensor(state, dtype=torch.float).unsqueeze(0)\n",
    "        # FIX: was built from `state`, so bootstrapping used the wrong state\n",
    "        next_state_tensor = torch.as_tensor(next_state,\n",
    "                dtype=torch.float).unsqueeze(0)\n",
    "\n",
    "        # train actor: off-policy policy gradient, weighted by rho = pi / b;\n",
    "        # q is detached so the actor loss does not backprop into the critic\n",
    "        q_tensor = self.critic_net(state_tensor)[0, action].detach()\n",
    "        pi_tensor = self.actor_net(state_tensor)[0, action]\n",
    "        behavior_prob = 1. / self.action_n  # uniform behavior policy\n",
    "        actor_loss_tensor = -self.discount * q_tensor / behavior_prob * pi_tensor\n",
    "        self.actor_optimizer.zero_grad()\n",
    "        actor_loss_tensor.backward()\n",
    "        self.actor_optimizer.step()\n",
    "\n",
    "        # train critic: importance-weighted semi-gradient SARSA\n",
    "        next_q_tensor = self.critic_net(next_state_tensor)[:, next_action]\n",
    "        # semi-gradient: no gradient flows through the bootstrap target\n",
    "        target_tensor = (reward + (1. - done) * self.gamma\n",
    "                * next_q_tensor).detach()\n",
    "        # FIX: compare q(s, a) only; previously the whole [1, action_n] output\n",
    "        # was matched against a scalar target (broadcasting warning, wrong loss)\n",
    "        pred_tensor = self.critic_net(state_tensor)[:, action]\n",
    "        critic_loss_tensor = self.critic_loss(pred_tensor, target_tensor)\n",
    "        pi_tensor = self.actor_net(state_tensor)[0, action]\n",
    "        # importance sampling ratio, detached: a constant weight on the loss\n",
    "        ratio_tensor = (pi_tensor / behavior_prob).detach()\n",
    "        critic_loss_tensor = critic_loss_tensor * ratio_tensor\n",
    "        self.critic_optimizer.zero_grad()\n",
    "        critic_loss_tensor.backward()\n",
    "        self.critic_optimizer.step()\n",
    "\n",
    "\n",
    "agent = OffPACAgent(env)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20:35:41 [INFO] ==== train & verify ====\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\Programs\\anaconda3\\envs\\pytorch16\\lib\\site-packages\\torch\\nn\\modules\\loss.py:445: UserWarning: Using a target size (torch.Size([1])) that is different to the input size (torch.Size([1, 3])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.\n",
      "  return F.mse_loss(input, target, reduction=self.reduction)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20:35:44 [DEBUG] verify episode 0: reward = -500.00, steps = 500\n",
      "20:35:46 [DEBUG] verify episode 1: reward = -500.00, steps = 500\n",
      "20:35:49 [DEBUG] verify episode 2: reward = -500.00, steps = 500\n",
      "20:35:52 [DEBUG] verify episode 3: reward = -500.00, steps = 500\n",
      "20:35:55 [DEBUG] verify episode 4: reward = -500.00, steps = 500\n",
      "20:35:57 [DEBUG] verify episode 5: reward = -500.00, steps = 500\n",
      "20:36:00 [DEBUG] verify episode 6: reward = -500.00, steps = 500\n",
      "20:36:03 [DEBUG] verify episode 7: reward = -500.00, steps = 500\n",
      "20:36:05 [DEBUG] verify episode 8: reward = -500.00, steps = 500\n",
      "20:36:08 [DEBUG] verify episode 9: reward = -500.00, steps = 500\n",
      "20:36:11 [DEBUG] verify episode 10: reward = -500.00, steps = 500\n",
      "20:36:13 [DEBUG] verify episode 11: reward = -500.00, steps = 500\n",
      "20:36:16 [DEBUG] verify episode 12: reward = -500.00, steps = 500\n",
      "20:36:18 [DEBUG] verify episode 13: reward = -500.00, steps = 500\n",
      "20:36:21 [DEBUG] verify episode 14: reward = -500.00, steps = 500\n",
      "20:36:23 [DEBUG] verify episode 15: reward = -500.00, steps = 500\n",
      "20:36:26 [DEBUG] verify episode 16: reward = -500.00, steps = 500\n",
      "20:36:29 [DEBUG] verify episode 17: reward = -500.00, steps = 500\n",
      "20:36:31 [DEBUG] verify episode 18: reward = -500.00, steps = 500\n",
      "20:36:33 [DEBUG] verify episode 19: reward = -500.00, steps = 500\n",
      "20:36:36 [DEBUG] verify episode 20: reward = -500.00, steps = 500\n",
      "20:36:38 [DEBUG] verify episode 21: reward = -500.00, steps = 500\n",
      "20:36:41 [DEBUG] verify episode 22: reward = -500.00, steps = 500\n",
      "20:36:43 [DEBUG] verify episode 23: reward = -500.00, steps = 500\n",
      "20:36:46 [DEBUG] verify episode 24: reward = -500.00, steps = 500\n",
      "20:36:48 [DEBUG] verify episode 25: reward = -500.00, steps = 500\n",
      "20:36:50 [DEBUG] verify episode 26: reward = -500.00, steps = 500\n",
      "20:36:53 [DEBUG] verify episode 27: reward = -500.00, steps = 500\n",
      "20:36:55 [DEBUG] verify episode 28: reward = -500.00, steps = 500\n",
      "20:36:58 [DEBUG] verify episode 29: reward = -500.00, steps = 500\n",
      "20:37:00 [DEBUG] verify episode 30: reward = -500.00, steps = 500\n",
      "20:37:02 [DEBUG] verify episode 31: reward = -500.00, steps = 500\n",
      "20:37:05 [DEBUG] verify episode 32: reward = -500.00, steps = 500\n",
      "20:37:07 [DEBUG] verify episode 33: reward = -500.00, steps = 500\n",
      "20:37:10 [DEBUG] verify episode 34: reward = -500.00, steps = 500\n",
      "20:37:12 [DEBUG] verify episode 35: reward = -500.00, steps = 500\n",
      "20:37:14 [DEBUG] verify episode 36: reward = -500.00, steps = 500\n",
      "20:37:17 [DEBUG] verify episode 37: reward = -500.00, steps = 500\n",
      "20:37:19 [DEBUG] verify episode 38: reward = -500.00, steps = 500\n",
      "20:37:22 [DEBUG] verify episode 39: reward = -500.00, steps = 500\n",
      "20:37:24 [DEBUG] verify episode 40: reward = -500.00, steps = 500\n",
      "20:37:27 [DEBUG] verify episode 41: reward = -500.00, steps = 500\n",
      "20:37:29 [DEBUG] verify episode 42: reward = -500.00, steps = 500\n",
      "20:37:32 [DEBUG] verify episode 43: reward = -500.00, steps = 500\n",
      "20:37:34 [DEBUG] verify episode 44: reward = -500.00, steps = 500\n",
      "20:37:36 [DEBUG] verify episode 45: reward = -500.00, steps = 500\n",
      "20:37:39 [DEBUG] verify episode 46: reward = -500.00, steps = 500\n",
      "20:37:41 [DEBUG] verify episode 47: reward = -500.00, steps = 500\n",
      "20:37:43 [DEBUG] verify episode 48: reward = -500.00, steps = 500\n",
      "20:37:46 [DEBUG] verify episode 49: reward = -500.00, steps = 500\n",
      "20:37:48 [DEBUG] verify episode 50: reward = -500.00, steps = 500\n",
      "20:37:51 [DEBUG] verify episode 51: reward = -500.00, steps = 500\n",
      "20:37:53 [DEBUG] verify episode 52: reward = -500.00, steps = 500\n",
      "20:37:56 [DEBUG] verify episode 53: reward = -500.00, steps = 500\n",
      "20:37:58 [DEBUG] verify episode 54: reward = -500.00, steps = 500\n",
      "20:38:00 [DEBUG] verify episode 55: reward = -500.00, steps = 500\n",
      "20:38:03 [DEBUG] verify episode 56: reward = -500.00, steps = 500\n",
      "20:38:05 [DEBUG] verify episode 57: reward = -500.00, steps = 500\n",
      "20:38:08 [DEBUG] verify episode 58: reward = -500.00, steps = 500\n",
      "20:38:10 [DEBUG] verify episode 59: reward = -500.00, steps = 500\n",
      "20:38:12 [DEBUG] verify episode 60: reward = -500.00, steps = 500\n",
      "20:38:14 [DEBUG] verify episode 61: reward = -500.00, steps = 500\n",
      "20:38:16 [DEBUG] verify episode 62: reward = -500.00, steps = 500\n",
      "20:38:19 [DEBUG] verify episode 63: reward = -500.00, steps = 500\n",
      "20:38:21 [DEBUG] verify episode 64: reward = -500.00, steps = 500\n",
      "20:38:24 [DEBUG] verify episode 65: reward = -500.00, steps = 500\n",
      "20:38:26 [DEBUG] verify episode 66: reward = -500.00, steps = 500\n",
      "20:38:28 [DEBUG] verify episode 67: reward = -500.00, steps = 500\n",
      "20:38:31 [DEBUG] verify episode 68: reward = -500.00, steps = 500\n",
      "20:38:33 [DEBUG] verify episode 69: reward = -500.00, steps = 500\n",
      "20:38:36 [DEBUG] verify episode 70: reward = -500.00, steps = 500\n",
      "20:38:38 [DEBUG] verify episode 71: reward = -500.00, steps = 500\n",
      "20:38:40 [DEBUG] verify episode 72: reward = -500.00, steps = 500\n",
      "20:38:43 [DEBUG] verify episode 73: reward = -500.00, steps = 500\n",
      "20:38:45 [DEBUG] verify episode 74: reward = -298.00, steps = 299\n",
      "20:38:48 [DEBUG] verify episode 75: reward = -443.00, steps = 444\n",
      "20:38:50 [DEBUG] verify episode 76: reward = -319.00, steps = 320\n",
      "20:38:52 [DEBUG] verify episode 77: reward = -178.00, steps = 179\n",
      "20:38:55 [DEBUG] verify episode 78: reward = -302.00, steps = 303\n",
      "20:38:57 [DEBUG] verify episode 79: reward = -149.00, steps = 150\n",
      "20:38:59 [DEBUG] verify episode 80: reward = -156.00, steps = 157\n",
      "20:39:01 [DEBUG] verify episode 81: reward = -185.00, steps = 186\n",
      "20:39:04 [DEBUG] verify episode 82: reward = -203.00, steps = 204\n",
      "20:39:06 [DEBUG] verify episode 83: reward = -165.00, steps = 166\n",
      "20:39:08 [DEBUG] verify episode 84: reward = -112.00, steps = 113\n",
      "20:39:10 [DEBUG] verify episode 85: reward = -105.00, steps = 106\n",
      "20:39:13 [DEBUG] verify episode 86: reward = -187.00, steps = 188\n",
      "20:39:15 [DEBUG] verify episode 87: reward = -151.00, steps = 152\n",
      "20:39:17 [DEBUG] verify episode 88: reward = -210.00, steps = 211\n",
      "20:39:20 [DEBUG] verify episode 89: reward = -263.00, steps = 264\n",
      "20:39:22 [DEBUG] verify episode 90: reward = -241.00, steps = 242\n",
      "20:39:24 [DEBUG] verify episode 91: reward = -247.00, steps = 248\n",
      "20:39:26 [DEBUG] verify episode 92: reward = -208.00, steps = 209\n",
      "20:39:29 [DEBUG] verify episode 93: reward = -214.00, steps = 215\n",
      "20:39:31 [DEBUG] verify episode 94: reward = -218.00, steps = 219\n",
      "20:39:33 [DEBUG] verify episode 95: reward = -162.00, steps = 163\n",
      "20:39:35 [DEBUG] verify episode 96: reward = -163.00, steps = 164\n",
      "20:39:37 [DEBUG] verify episode 97: reward = -199.00, steps = 200\n",
      "20:39:40 [DEBUG] verify episode 98: reward = -210.00, steps = 211\n",
      "20:39:42 [DEBUG] verify episode 99: reward = -190.00, steps = 191\n",
      "20:39:44 [DEBUG] verify episode 100: reward = -320.00, steps = 321\n",
      "20:39:46 [DEBUG] verify episode 101: reward = -137.00, steps = 138\n",
      "20:39:49 [DEBUG] verify episode 102: reward = -131.00, steps = 132\n",
      "20:39:51 [DEBUG] verify episode 103: reward = -153.00, steps = 154\n",
      "20:39:53 [DEBUG] verify episode 104: reward = -136.00, steps = 137\n",
      "20:39:55 [DEBUG] verify episode 105: reward = -103.00, steps = 104\n",
      "20:39:58 [DEBUG] verify episode 106: reward = -101.00, steps = 102\n",
      "20:40:00 [DEBUG] verify episode 107: reward = -124.00, steps = 125\n",
      "20:40:02 [DEBUG] verify episode 108: reward = -130.00, steps = 131\n",
      "20:40:04 [DEBUG] verify episode 109: reward = -97.00, steps = 98\n",
      "20:40:06 [DEBUG] verify episode 110: reward = -118.00, steps = 119\n",
      "20:40:08 [DEBUG] verify episode 111: reward = -155.00, steps = 156\n",
      "20:40:11 [DEBUG] verify episode 112: reward = -169.00, steps = 170\n",
      "20:40:13 [DEBUG] verify episode 113: reward = -127.00, steps = 128\n",
      "20:40:15 [DEBUG] verify episode 114: reward = -135.00, steps = 136\n",
      "20:40:17 [DEBUG] verify episode 115: reward = -118.00, steps = 119\n",
      "20:40:19 [DEBUG] verify episode 116: reward = -117.00, steps = 118\n",
      "20:40:21 [DEBUG] verify episode 117: reward = -140.00, steps = 141\n",
      "20:40:23 [DEBUG] verify episode 118: reward = -108.00, steps = 109\n",
      "20:40:26 [DEBUG] verify episode 119: reward = -89.00, steps = 90\n",
      "20:40:28 [DEBUG] verify episode 120: reward = -112.00, steps = 113\n",
      "20:40:30 [DEBUG] verify episode 121: reward = -141.00, steps = 142\n",
      "20:40:32 [DEBUG] verify episode 122: reward = -114.00, steps = 115\n",
      "20:40:34 [DEBUG] verify episode 123: reward = -90.00, steps = 91\n",
      "20:40:34 [INFO] ==== test ====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20:40:34 [DEBUG] test episode 0: reward = -135.00, steps = 136\n",
      "20:40:34 [DEBUG] test episode 1: reward = -130.00, steps = 131\n",
      "20:40:35 [DEBUG] test episode 2: reward = -312.00, steps = 313\n",
      "20:40:35 [DEBUG] test episode 3: reward = -106.00, steps = 107\n",
      "20:40:35 [DEBUG] test episode 4: reward = -92.00, steps = 93\n",
      "20:40:35 [DEBUG] test episode 5: reward = -106.00, steps = 107\n",
      "20:40:35 [DEBUG] test episode 6: reward = -119.00, steps = 120\n",
      "20:40:35 [DEBUG] test episode 7: reward = -121.00, steps = 122\n",
      "20:40:35 [DEBUG] test episode 8: reward = -100.00, steps = 101\n",
      "20:40:35 [DEBUG] test episode 9: reward = -102.00, steps = 103\n",
      "20:40:35 [DEBUG] test episode 10: reward = -87.00, steps = 88\n",
      "20:40:35 [DEBUG] test episode 11: reward = -139.00, steps = 140\n",
      "20:40:35 [DEBUG] test episode 12: reward = -82.00, steps = 83\n",
      "20:40:35 [DEBUG] test episode 13: reward = -95.00, steps = 96\n",
      "20:40:36 [DEBUG] test episode 14: reward = -102.00, steps = 103\n",
      "20:40:36 [DEBUG] test episode 15: reward = -82.00, steps = 83\n",
      "20:40:36 [DEBUG] test episode 16: reward = -100.00, steps = 101\n",
      "20:40:36 [DEBUG] test episode 17: reward = -102.00, steps = 103\n",
      "20:40:36 [DEBUG] test episode 18: reward = -113.00, steps = 114\n",
      "20:40:36 [DEBUG] test episode 19: reward = -99.00, steps = 100\n",
      "20:40:36 [DEBUG] test episode 20: reward = -129.00, steps = 130\n",
      "20:40:36 [DEBUG] test episode 21: reward = -125.00, steps = 126\n",
      "20:40:36 [DEBUG] test episode 22: reward = -97.00, steps = 98\n",
      "20:40:36 [DEBUG] test episode 23: reward = -120.00, steps = 121\n",
      "20:40:36 [DEBUG] test episode 24: reward = -171.00, steps = 172\n",
      "20:40:36 [DEBUG] test episode 25: reward = -121.00, steps = 122\n",
      "20:40:37 [DEBUG] test episode 26: reward = -110.00, steps = 111\n",
      "20:40:37 [DEBUG] test episode 27: reward = -205.00, steps = 206\n",
      "20:40:37 [DEBUG] test episode 28: reward = -168.00, steps = 169\n",
      "20:40:37 [DEBUG] test episode 29: reward = -86.00, steps = 87\n",
      "20:40:37 [DEBUG] test episode 30: reward = -131.00, steps = 132\n",
      "20:40:37 [DEBUG] test episode 31: reward = -212.00, steps = 213\n",
      "20:40:37 [DEBUG] test episode 32: reward = -146.00, steps = 147\n",
      "20:40:37 [DEBUG] test episode 33: reward = -97.00, steps = 98\n",
      "20:40:37 [DEBUG] test episode 34: reward = -76.00, steps = 77\n",
      "20:40:38 [DEBUG] test episode 35: reward = -129.00, steps = 130\n",
      "20:40:38 [DEBUG] test episode 36: reward = -97.00, steps = 98\n",
      "20:40:38 [DEBUG] test episode 37: reward = -84.00, steps = 85\n",
      "20:40:38 [DEBUG] test episode 38: reward = -122.00, steps = 123\n",
      "20:40:38 [DEBUG] test episode 39: reward = -106.00, steps = 107\n",
      "20:40:38 [DEBUG] test episode 40: reward = -119.00, steps = 120\n",
      "20:40:38 [DEBUG] test episode 41: reward = -97.00, steps = 98\n",
      "20:40:38 [DEBUG] test episode 42: reward = -94.00, steps = 95\n",
      "20:40:38 [DEBUG] test episode 43: reward = -101.00, steps = 102\n",
      "20:40:38 [DEBUG] test episode 44: reward = -109.00, steps = 110\n",
      "20:40:38 [DEBUG] test episode 45: reward = -89.00, steps = 90\n",
      "20:40:38 [DEBUG] test episode 46: reward = -108.00, steps = 109\n",
      "20:40:38 [DEBUG] test episode 47: reward = -87.00, steps = 88\n",
      "20:40:39 [DEBUG] test episode 48: reward = -142.00, steps = 143\n",
      "20:40:39 [DEBUG] test episode 49: reward = -86.00, steps = 87\n",
      "20:40:39 [DEBUG] test episode 50: reward = -79.00, steps = 80\n",
      "20:40:39 [DEBUG] test episode 51: reward = -78.00, steps = 79\n",
      "20:40:39 [DEBUG] test episode 52: reward = -92.00, steps = 93\n",
      "20:40:39 [DEBUG] test episode 53: reward = -122.00, steps = 123\n",
      "20:40:39 [DEBUG] test episode 54: reward = -105.00, steps = 106\n",
      "20:40:39 [DEBUG] test episode 55: reward = -124.00, steps = 125\n",
      "20:40:39 [DEBUG] test episode 56: reward = -107.00, steps = 108\n",
      "20:40:39 [DEBUG] test episode 57: reward = -101.00, steps = 102\n",
      "20:40:39 [DEBUG] test episode 58: reward = -88.00, steps = 89\n",
      "20:40:39 [DEBUG] test episode 59: reward = -105.00, steps = 106\n",
      "20:40:40 [DEBUG] test episode 60: reward = -370.00, steps = 371\n",
      "20:40:40 [DEBUG] test episode 61: reward = -104.00, steps = 105\n",
      "20:40:40 [DEBUG] test episode 62: reward = -113.00, steps = 114\n",
      "20:40:40 [DEBUG] test episode 63: reward = -188.00, steps = 189\n",
      "20:40:40 [DEBUG] test episode 64: reward = -218.00, steps = 219\n",
      "20:40:40 [DEBUG] test episode 65: reward = -193.00, steps = 194\n",
      "20:40:40 [DEBUG] test episode 66: reward = -101.00, steps = 102\n",
      "20:40:40 [DEBUG] test episode 67: reward = -120.00, steps = 121\n",
      "20:40:40 [DEBUG] test episode 68: reward = -109.00, steps = 110\n",
      "20:40:41 [DEBUG] test episode 69: reward = -113.00, steps = 114\n",
      "20:40:41 [DEBUG] test episode 70: reward = -95.00, steps = 96\n",
      "20:40:41 [DEBUG] test episode 71: reward = -85.00, steps = 86\n",
      "20:40:41 [DEBUG] test episode 72: reward = -100.00, steps = 101\n",
      "20:40:41 [DEBUG] test episode 73: reward = -95.00, steps = 96\n",
      "20:40:41 [DEBUG] test episode 74: reward = -286.00, steps = 287\n",
      "20:40:41 [DEBUG] test episode 75: reward = -118.00, steps = 119\n",
      "20:40:41 [DEBUG] test episode 76: reward = -92.00, steps = 93\n",
      "20:40:41 [DEBUG] test episode 77: reward = -121.00, steps = 122\n",
      "20:40:41 [DEBUG] test episode 78: reward = -97.00, steps = 98\n",
      "20:40:42 [DEBUG] test episode 79: reward = -219.00, steps = 220\n",
      "20:40:42 [DEBUG] test episode 80: reward = -93.00, steps = 94\n",
      "20:40:42 [DEBUG] test episode 81: reward = -99.00, steps = 100\n",
      "20:40:42 [DEBUG] test episode 82: reward = -121.00, steps = 122\n",
      "20:40:42 [DEBUG] test episode 83: reward = -120.00, steps = 121\n",
      "20:40:42 [DEBUG] test episode 84: reward = -112.00, steps = 113\n",
      "20:40:42 [DEBUG] test episode 85: reward = -91.00, steps = 92\n",
      "20:40:42 [DEBUG] test episode 86: reward = -100.00, steps = 101\n",
      "20:40:42 [DEBUG] test episode 87: reward = -104.00, steps = 105\n",
      "20:40:42 [DEBUG] test episode 88: reward = -154.00, steps = 155\n",
      "20:40:42 [DEBUG] test episode 89: reward = -159.00, steps = 160\n",
      "20:40:43 [DEBUG] test episode 90: reward = -126.00, steps = 127\n",
      "20:40:43 [DEBUG] test episode 91: reward = -94.00, steps = 95\n",
      "20:40:43 [DEBUG] test episode 92: reward = -100.00, steps = 101\n",
      "20:40:43 [DEBUG] test episode 93: reward = -106.00, steps = 107\n",
      "20:40:43 [DEBUG] test episode 94: reward = -114.00, steps = 115\n",
      "20:40:43 [DEBUG] test episode 95: reward = -103.00, steps = 104\n",
      "20:40:43 [DEBUG] test episode 96: reward = -89.00, steps = 90\n",
      "20:40:43 [DEBUG] test episode 97: reward = -112.00, steps = 113\n",
      "20:40:43 [DEBUG] test episode 98: reward = -96.00, steps = 97\n",
      "20:40:43 [DEBUG] test episode 99: reward = -101.00, steps = 102\n",
      "20:40:43 [INFO] average episode reward = -120.20 ± 46.76\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAD4CAYAAAAEhuazAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deZRcZ3nn8e9T1V3Vu9Sb1pbUkixkywZsubENBmKCwQpJsMMkczyEAwnJOPhAJmEmIXGcQDKJzzAhmSQMMcQDhHUwngCxQ2LWELZ4QTbCq2Rr3yx3t9RS77W+88e9t7qqunqtrm7dqt/nnD7qulVdde9V9dNPPe9z39ecc4iISG2JrPQOiIjI8lPwFxGpQQr+IiI1SMFfRKQGKfiLiNSgupXegfnq6upyvb29K70bIiKh8thjjw0657qLt4cm+Pf29rJ3796V3g0RkVAxs2OltqvsIyJSgxT8RURqkIK/iEgNUvAXEalBCv4iIjVIwV9EpAYp+IuI1CAFfxGRZTYymeK+vSeYa0r98WSaRDpTkX1Q8BcRWWafe/g47/uHJ/jJyQuzPu4Lj57gpR/4BkNjySXfBwV/EZFl9v3nBwB4/NjQrI/bd+I8XS0x2ptjS74PZQV/M/slM3vazLJm1ld03x1mdtDMDpjZTXnbrzazJ/37PmxmVs4+iIiEyUQyw96jXtB//PhcwX+IKzevrsh+lJv5PwW8Bfhe/kYz2wXcClwO7AHuNrOof/dHgduAHf7XnjL3QURkyWSyjj//2n5OnZ+oyPM/evQcyUyWrpb4rJn/2dEEJ85NcOWmizD4O+eedc4dKHHXzcC9zrmEc+4IcBC4xszWA23OuYecN9LxGeCWcvZBRGQpHTgzwt3/doh7Hz1ekef//nMDxKIRfvX6Xk5fmOTMhcmSj9t34jwAV25qr8h+VKrmvxE4kXf7pL9to/998XYRkWVx/Ow4f/H1A3zwwf0l7z80MArAI0fOVeT1f3BwkL7edq6/pAuYufSz78R5ohHjio1tFdmPOad0NrNvAetK3HWnc+7+mX6sxDY3y/aZXvs2vBIRmzdvnmNPRURm5pzjt7+4j/v3nc5t+7VXb6W7NV7wuCD47ztxnslUhob6KEulf3iS/WdGeN+enexa30a8LsLjx4Z400vXT3vsvhPnecnaVppilZl5f87M3zl3o3PuihJfMwV+8DL6TXm3e4DT/vaeEttneu17nHN9zrm+7u5paxGIyDL5+tNneO8X983Zl34x6x9JcP++07zlqo387Vt3A6Wz7kMDYwAk09lc6WWp/ODgIACv3dFNrC7CSzeu4rES+5DNOvadOF+xej9UruzzAHCrmcXNbCvewO6jzrkXgBEzu87v8nk7MNsfERFZYYcHRnnvF/fxlR+fIpHOrvTuLNqhfi+j/8Wre7hx1xpi0UjJAddD/aPs3rwaM3h0iUs/P3h+kI7mGLvWe6Wc3VvaefrU8LQLuQ4PjjEymeaqizX4m9kvmNlJ4JXAP5vZ1wGcc08D9wHPAF8D3u2cC47uduDjeIPAh4AHy9kHEamcRDrDb37hx4wnvV/f4YnUCu/R4gXlnG3dLcTroly+sW1a5p/NOg4PjrJ7czuXrmvjkSNnl3QfHj16juu2dRCJeBXw3ZtXk8xkeerUcMHjcoO9FWrzhPK7fb7inOtxzsWdc2udczfl3XeXc267c26nc+7BvO17/bLRdufce1yYP0eKVLkPPrifp08Pc8uVGwAYngxz8B+jORZlbZtX49+9uZ0nTl4gmfdp5vSFCSZTWbavaeHarR08dmyo4P5yjCfTnBya4NJ1UwO4uzd7nTw/LvojtO/EEC3xOrZ3tyzJa5eiK3xFpKRM1vHZh47xi1f38Au7vaG6CxPpFd6rxTs0MMr2NS0E15VevaWdRDrLMy8M5z3Gq/dv7/aC/2Qqy5OnZp+CYb4O+899yZqpgL6m
rYGNqxunjS3sO3Gel/WsIhqp3DWwCv4iUtLZsQTprOPlPatoa/A6TsJc9jk8MFaQSQdZd37dPxgX2NbdzDVbOwAWXfopLmoc9J87P/gDXLa+lQNnRnK3E+kM+18Y4WU9lSv5gIK/iMygfzgBQHdrA22N9UB4yz7jyTSnzk+wvbs5t23dqgY2rGoo6LY5NDDKqsZ6OptjdLbE2bGmhUcOz3/QdzSR5r//0zO85e4fsuv9X+eOLz+Zu+9g/yjRiNHb2VzwMzvXtXJ4cCw36Pv8i6Oks65i/f0BBX8RKWlgxAv+a9ritDX4wT+kmf/hvHJOvqu2tPPjvMzf+3TQnCsN7d7czlMLKPv8009O88kfHiFixsb2Rr717Iu5TwDP94+wpaOJWF1h2L10XRuZrMt9Mnj6tPd6l29YtcCjXBgFfxEpqX/Em3aguyVOa1D2mQxnzT/o9NleVHLZvbmd0xcmeeHCRO5x+X8geruaOTuWZGSen3i+/eyLbFzdyP971yt5x6t6GRhJcHLIe+6D/aPTXh/g0nWtALnSz9Onh2mORdnS0bTAo1wYBX8RKWmq7BOnoT5KvC4S2sz/0MAYEYMtnYUB9eotXt3/kcPnGJ5M0T+SKAjQweOPnR2f8zUmkhl+cHCQN+xai5mx22/TfPz4EKlMlmNnx9lRIvhv7WomFo3kgv8zp4e5bH1brh20UhT8RaSk/pEEqxrrc9MbtDXWh7bmf2hglE0dTcTrCqdquHxDG72dTfzpV5/h3w54c+znZ/5B8D9+rnTwP5G3/YcHB5lMZXn9ZWsA2Lm2laZYlMePDXHs7BjprJs22AtQF41wyZoWnj0zQjbrePaFYS7fUNl6Pyj4i8gMBkYSrMmb96atoY7hkLZ6HuofLdkzXx+N8IlfeQXprON37vsJQMGg8BZ/cPbo2bFpP7v/zDCv+fPv8NmHjwHw7f0v0hKv49qtnYAX1F/es5rHj5+fsdMncOm6Vg6cGebo2THGkpmK1/tBwV9EZtA/MsmatrzgH4LMfzKVyQ2YBjJZx5HBsYKgnm97dwsfe9vVOBx1EWNTXq29JV5HV0uM4yXKPs+/6AX0D/7Ls5w6P8G3nu3np17SXTCge/WWdp55YZgn/OUaZ7po69L1rbw4nOCHh7y20l3K/EVkpfSPJFjT2pC73dZQf1HX/NOZLO/63GP8/P/+Af3DU3Pknz4/QSKdnfVq2Vdu7+Qjb93Nb/70DuqjhWFxS2dzycw/GMjNOMevfPJRBkYS3LhrTcFjdm9ZTSbruH/faTasaqA5XnqGzp3+Vb9ffvwkdRFjx9rKXdkbUPAXkWmcc37wL878L86yj3OOP/knr26fdfCjo4W9+zC906fYTZev47du3DFt+5aOppIDvqfOj7O6qZ7feeNOnu8fJWJww0sKg/9V/kIsp85PzPr6l/kdPz8+fp4da1unjU1UgoK/iEwzPJEmmc4WzHXv1fwvzsz/7394lM8+fIx3Xr+VhvoIPzo6dWFWUG/f1lW67DOXLZ3NvHBhkslU4cybJ4cm2Li6kV+9fitXb2nnNTu6py203t4cY5tfbtqxpnXG1+hujdPe5F1LsRyDvaDgLyIl5Hr8p2X+qYtyTv+PffcQr9reyR/+7GVctamdvcemgv/3nx+kt7OJzpb4LM8ws94ubwzgRFHHz6mhCXraG4lGjC/85+v4+Dv6Sv58MI3ETIO9AGaWm/BNwV9EVkx/cHVvUc0/lXFMpi6+Of0nkhl2rmslEjFe0dvOM6eHGU2kGU2keejQWW68bO2in3tzx/Ref+ccp85PsHG1d1+sLjJtrCAQXEswVx1/p1/6Ceb6r7TKrA8mIqGWP7VDYFXe/D6NscrXpBcikc7m6uR9vR1kHew7fp6RyRTJTJYbdy0++PeWaPccGk8xnsywsb1xzp+/5cqNZJ3j6s2zL8R+w85uvvvcAFdsrHybJyj4i0gJQdmncMB3ambPtW0NJX9uJTjnSGayuRbLqzavJmLw
o6PnODk0warGevq2zB54Z7O6qZ7WhrqCzP+U3+nTM4/g3xiL8svXbpnzcTfsXMMNO9fM+bilouAvItP0DydorI/SkteamJvc7SLr9U9mvDJU3A/+rQ31XLbeW4XruRdHed3ObupmKMnMh5k3E+exvJr/qfPe9xtXzx38L1aq+YvUiP7hSf7Hg88ylpi7XbN/JMGatnhudktgalrni+wq32ClrVhegH9FbwcPHz7HubFkWSWfwJbOJo7llX1OLiDzv1gp+IvUiB8cHOTvvnuYP7r/qTkf2z8yWVDyAaYWdPEz/8ePD3HP9w4t/Y4uUC74511Z29frlXnqIsZrX9Jd9mts6Wzi5NAEKf9TxsmhCVridblxkDBS8BepEUGXzpcfP8WXHjs562P7RxIFbZ6Qn/l7wf/zDx/nf37tANnsyrZ+JtKFZR+Avi3eKlzXbevMlavKsaWzmUzWcfq8l/F7nT6NBZ+MwkbBX6RGBBcp7Vrfxh/d/xSH/StfSxkomtoBmDan/7GzY2SyjgsrfOFXqcx/3aoGfuVVvdz22m1L8hrB3PqHB73Sz8mhiXl1+lzMFPxFakSQIX/krVcRjRgf+deDJR83mcowMpmelvnH66I01EdywT5ofRwcTVRwr+cWDPgWr5D1x2++fElKPgCXbWgjVhfh3/b3A3BqaDzU9X5Q8BepGUHm39vZzCt6O3j69HDJxwWLuBTX/GFqcreRyRSDo0mA3L8rpdSA71Jra6jnDbvW8sBPTnN2NMHwZDrUnT6g4C9SMxLpLLFohEjEuHRdK4cGRnOBM1+ux79EL38wxUN+z/vZsaXJ/A/2j/L1p89wcmh8QVNIJEqUfSrhF3f3MDSe4vOPHAcIfdlHff4iNWIylckNiu5c10o66zg0MMplRdMJTE3tUCrz9xZ0OTI41fY4OLI0wf8PvvIkjx45l3vtf3z39WyYR3adSHufaCo9E+ZrdnTR1RLjEz84AoS7xx+U+YvUjEQ6Q9xfkjGYRCxYNzZfMBd+yeCfy/y94B8xODu2NGWfMxcmuf6STt5740voH0nwxMnz8/q5UgO+lVAXjXDzlRtzYx497ZVdYL3SFPxFakQilaWh3vuV39bdTH3U2F8i+J8cmqChPkJH0fTEMFXzPzI4ztq2OB3N8SWr+Q+MJLhsXRu/9pqtABwZnHvRdJgK/vEKB3+At+zemHutrpbp5ydMVPYRqRGT6amyT300wvbuFvafmT7oe+zcOJs7mkr2sLc11jE8mebY2TF6O5u5MJFakm6fsUSaiVSGrta4v3RivOCK2tnM1O1TCbvWt3HpulZSmWyoe/xBwV+kZniZ/1Rd/LL1bTx8+Oy0xx0/O87mjtILnwSZ/9GzY7z+0rWcPD/O2SUI/sEsot3+nPu9nU0F4wqzWY5un4CZ8Te3XsVY8uKa4mIxVPYRqRGT6UxB8N+5rpUXLkxyYXzqIi3nHMfPjbOls3Q9u62xnnTWMTiapLerma6W+JLU/Af8PyBd/jhDb1fpdXNLyV3hW7884WznutbcAi1hpuAvUiMmU9mCuniweEh+6WdgNMFEKpNbwKRY/lQJvZ1NdDbHl6TbZ7BE5v/icILxeWTYy5n5VxOdLZEqdGE8xQfuf6pg3dlEUeZ/md/xkz/oe9zv3988Y+Y/VSnu7WqmsyXGWDLDRDJT8vHzNZX5x3LPDZRcOL3YcnX7VBudLZEq9PCRs3z6oWMFV/EWZ/5r2+KsaqwvCP5BsN0yj8x/S2dTLlMv90KvwZEEEYPO5iDzD4L/3KWf5RzwrSY6WyJVKMj48zPy4szfzNi5rpUDeWWf4+fGMZv56tVgCuO1bXGaYnV0+u2O5bZ7Dowm6GiOEY14HTRB5j+fds+Eyj6LorMlUoVywT+v7DOZ1+cfuGxdKwfOjOSmZT5+bpwNqxpnvFo2mNY5yMw7g8y/zI6fgZEkXS1TF5UF7Z5H59Hxk0hniNVFQt96udwU/EWqUDB3f/6AqTe9Q2FQ37Wh
jbFkhkP+9M7Hzo7NONgLUwu6BME/uNDp7BJk/sWziPZ2Ns2r4yeZzhJX1r9gOmMiVSjI+AsHfLPT2iGvv6QLgO8+NwB4mf+swb+xns7mGLu3rAamavQDZWb+gyOJ3PhBYL7tnsl0VvX+RdAZE6lCQdAf92v+2azzMuSizL+nvYlL1rTw3ecGGEukGRxNztjpA96VwQ/d8Xr+Y98mABpjUZpj0bIyf+ccA6OJXI9/YKZ2z/963z7+/odHcrcV/BdHZ0ykCk2VfbzgH3TEFNf8AV63s5tHDp/jwIte189MF3gFiuvrnS3xsrp9RhJpkulsycwfCts9z48n+cqPT+Vm/wTv2JZjXp9qU9YZM7MPmdl+M3vCzL5iZqvz7rvDzA6a2QEzuylv+9Vm9qR/34dNozQiS26yqOwT/NtQYiD3hp1rSGayfPHREwCzln1K6WqJlTW/TzC1Q9DjHwjGFfIHfR85cg7nCgeyEyll/otR7hn7JnCFc+5lwHPAHQBmtgu4Fbgc2APcbWbBu+6jwG3ADv9rT5n7ICJFiss+wSeBUlMg9PW20xSL8o/7TgGwZYZ5fWbS2RIvq+wzdXVv4eIxQeZ/NC/zf+iQNxdRfgtrMqPgvxhlnTHn3Decc0FB7mGgx//+ZuBe51zCOXcEOAhcY2brgTbn3EPOW6rnM8At5eyDiExXHPyDBU9KZf7xuiiv2t5FIp1lVWM9q5rqpz1mNl7mv/jgX3x1b6BUu+e/HxoECgeyk/4KZbIwS3nG3gk86H+/ETiRd99Jf9tG//vi7SKyhIq7fWbL/AFu2OktdL7Qkg9AV0ucc2MJMtn5L72Yr3hGz3xXblrFt/f3M5nKMDCS4LkXvZbUieLgr8x/weY8Y2b2LTN7qsTXzXmPuRNIA58PNpV4KjfL9ple+zYz22tmewcGBubaVRHxFff5z5b5Q17wn2Owt5TO5hhZ5w3GLsbgaIJoxGhvmr44yjtfvZXB0QRfevwkD/nTT2/vbi6s+WemdzHJ3Oacz985d+Ns95vZO4CfA17vplZdPglsyntYD3Da395TYvtMr30PcA9AX1/f4tIKkRo0MUPNP396h3w97U287brNvPqS7gW/VnCV7+BoMvf9QgyMJOhsjhGJTM8NX7mtk5dvWs3fffcw123roLWhjr4tHXx7/4u5xyRSGWIllpyU2ZXb7bMH+D3gzc65/Ek4HgBuNbO4mW3FG9h91Dn3AjBiZtf5XT5vB+4vZx9EZLrEDN0+s815/2e3vJQ9V6xb8Gt1LWCKh/ypJAKDo8lpV/cGzIzbf2obx8+N86XHT3Ht1k5aGuo04LsEyj1jHwFagW+a2T4z+xiAc+5p4D7gGeBrwLudc8H/1u3Ax/EGgQ8xNU4gIkukuM8/mPxsprJPOYIpHgbnWNTlYP8IN/319/jAA08zVSTwMv+uWT4xvHHXOrZ1N5PJOl61vZPG+igTqUzuOTS9w+KU2+1ziXNuk3PuSv/rXXn33eWc2+6c2+mcezBv+17n3BX+fe9x+e8CEVkSE0UTu+X6/Cuw2lVQ6hmYY1GXg/3eYO1nHz7GJ394NLd9sMS8PvkiEeM9r7uEiMFrX9JNYyxK1k1duKYB38XRGr4iVah4SufcUocVyPzbm+qpj9qcwf/4Oa8y/NqXdPNn//wMPe2NvHHXWgZHZ8/8Ad6yu4dX7+hiTWsD33vOO4bJpDfQqyt8F0dnTKQKTb/Iq3KZv5nR3RKnf2Ry1scdPzfOqsZ6/u5tV/OyntXc/rnH+MtvPEcq42bN/ANrWr2LwBpjXvAPPtXoCt/F0RkTqUJBzb+47FOplsju1vg8Mv8JNnc00RiL8vlfv5Y9V6zjI985CEyNG8xHY31h8NeA7+LojIlUmUzWkcxkiUaMZDpLJuumyj4VyPwBulsb6B+ePfifzJsuuiVex9++dTd/+LOX0dUS44qNq+b9WkG76kQyQybryGQdsaj6/BdKwV+kygQXdAUXTY0n07nWz0rVxte0
xWed0z+TdZwcmqCnY2p5SDPj11+zjb1/+Aa2d7fM+7Wmyj5pLd5eBp0xkSoTDPJ2NHtz9EykMt5CLhVc6nBNa5xzY8lcMC724vAkyUx2UdNHFMuVfZLZ3OtpwHfhdMZEqsykHxCDzH8imWEylZnx6t6lEAzGzjS1c9Dps6TBP5XJfcpR5r9wOmMiVSYY3J0q+2SYTFW2HXKN363TP8Og75IG/5h3HMEnGlDwXwydMZEqE5R92pv9zN/PkCua+bf5wX+4dLvnyXPjRAw2rG4sef9CBMcxmczkLvRS2WfhdMZEqkxQCulszi/7ZCvS4x8Iyj6zZf4bVjdSvwTTMDTFvGtTJ1KZqQFfTe+wYDpjIlUm6PFf7S/KMpH0Mv9KTnvc2RLDbPbgv6m9/JIPFNb8kxVuYa1mOmMiVWaq28ev+acqn/nXRyN0NMUYmOEq3+ACr6UQlHi8P2pB5q8+/4VS8BepMpPpopp/Ml3xzB9mvsp3IplhcDSxqIViSolEjIb6SGHZRzX/BdMZE6kyQdmno2n5av4Aa9oaCso+39nfz1gizYkhr9Nn0xJl/uCVfiaSGZIZtXouls6YSJUJ5rwpKPukM8Qr2O0DXrtnMMXD/jPD/OqnfsS7PvcYhwe8qZyXquwD5Ob014Dv4mlKZ5EqE0zlsKqpHjO/Nl7hPn/wgv/gaIJs1vHokXMAfP/5QZ73F13f1F5+m2egIRYt6PPXgO/C6YyJVJnc9M110Vx5pNJ9/uAF/3TWcW48yY+ODrF+VQO337CdM8OTNMeiuU8iS6GxPspkwYCvQtlCKfMXqTITqQzRiFEfNZpiUcZTy5P5dwe9/sMJHjt6jr7eDn73jTsZHEkwPJla0nmFmmKFZR9d5LVwCv4iVWYylaXBn8StMeZlyJPLkfn7V/nuO3Ge0xcm+Y0t7UQixod+6eVL/loN9VFGE5rVsxw6YyJVZiKVyU173FgfZSSRJpVxFVm8PV8wv8+DT70AQF9ve8Vea6rbR8F/sXTGRKrMZGqqp78xVsf58SRQ+UHRYIqHfz90lpZ4HZeua6vYazXG1O1TLp0xkSqTyOvpb6yPMDSeAqChwtlxYyxKa7yOTNaxe0s70Uhl1g6AvMw/7a1YVqfgv2A6YyJVJr/s05SX+Ve65g/Q7df9X7GlciUf8I4lmK1UWf/i6KyJVJnJVCZX32+MRXOZ/3L0wgd1/77ejoq+TmMsyqRf9lG9f3F01kSqTP6qXY31UTJZB1DxAV/w6v51EePKTasr+jqN9VFSGcd4MqPgv0hq9RSpMhOpLB3NQdlnKuAvR+b/y9du5qrNq3Nlp0oJpnW+MJFSj/8iKfiLVJlEKpM34DsVhJcj8792WyfXbuus+OsEf1wuTKSU+S+SzppIlSko+xRk/tUz531+5q8B38XRWROpMhOpTC44FpR9qihDDv6onR9X2WexdNZEqsxkQZ9/XtmnWjN/Bf9F0VkTqSLOuYJ5fBpjU8N61ZQhN+St41vpFcqqVfW8G0SERDqLc1PBMb/sU1WZf95xKfNfHJ01kSqS8JdwzO/zD1R6GcfllH9cGvBdHJ01kSoSLN6eq/kXDPhWUeZfr8y/XDprIlVkIukF/8aizD9iUB+t3ERry01ln/LprIlUkanMv7DmH6+LLulKWiutsUpbWJeTzppIFZnM1fwLyz7VVO+Hwumplfkvjs6aSBUJyj7FA77V1OkDUBeN5AZ6FfwXR2dNpIpML/t4ff7VWBoJPs3E1e2zKDprIlUkkfKDf11Q6/d+xast84epkpYy/8Up66yZ2Z+a2RNmts/MvmFmG/Luu8PMDprZATO7KW/71Wb2pH/fh62aRqFEVtiEH/yDwBiJGI310aqa1C0QlLSqqYV1OZX7J/NDzrmXOeeuBL4KvB/AzHYBtwKXA3uAu80s+B/6KHAbsMP/2lPmPojUpA99fT/vv/+pgm3FA77gdfxUZ9lHmX85yjprzrnhvJvNgPO/vxm41zmXcM4dAQ4C
15jZeqDNOfeQc84BnwFuKWcfRGrV48fOs/foUMG23IBvXeG0DtVY9mlS2acsZS/mYmZ3AW8HLgCv8zdvBB7Oe9hJf1vK/754+0zPfRvepwQ2b95c7q6KVJVUJpsb4A0Et/P74Fc11tMar751m3I1fw34LsqcZ83MvmVmT5X4uhnAOXenc24T8HngPcGPlXgqN8v2kpxz9zjn+pxzfd3d3XMfjUgNSWayubl8AkHZJ7/M86Ffehnv27NzWfdtOTSq7FOWOdMB59yN83yu/wv8M/ABvIx+U959PcBpf3tPie0iskDJdJbJVFHmn8oQr4sUXM17+YZVy71ry6KhvrCjSRam3G6fHXk33wzs979/ALjVzOJmthVvYPdR59wLwIiZXed3+bwduL+cfRCpVclM6eBf6cXTLxbK/MtTbiHwg2a2E8gCx4B3ATjnnjaz+4BngDTwbudc8C69HfgU0Ag86H+JyAJ5Nf/isk9mWRZqvxioz788ZQV/59x/mOW+u4C7SmzfC1xRzuuKiFf2yWQdqUyWen/QcyJvCcdq16iyT1l01kRCKpXxeiXySz+TqUxVtnWWkuvzj9bG8S41BX+RkEr6JZ/JvI6fWgr+QdknXiOfdJaazppISCUzQfAvzvxr49e6SX3+ZdFZEwkh51xe5j8V/CdSmdxMntVuW1cLrfE6OlpiK70roVQb7xKRKpPOTl0bmV/2GU9mCta3rWav3tHFk39y09wPlJKU+YuEUDKvxTN/iofJZO30+Ut5FPxFQiiVKRzkDYynMrlauMhsFPxFQqgg888r+0zUUNlHyqPgLxJCifT0zD+TdSTSWZV9ZF4U/EVCqFTZJ7eKlzJ/mQcFf5EQSuYHf/9TQLCQi2r+Mh8K/iIhlEpPtXoGi7YHwb+xRvr8pTwK/iIhlMwUXtULMJ5KAyr7yPwo+IuEUDI9/SIvlX1kIRT8RUIoWWrANzl9/V6RmSj4i4RQqsQVvur2kYVQ8BcJofzMfyLpfT+uso8sgIK/SAgFff4Ry8v8/eBfK/P5S3kU/EVCKLjCtyVeN9XqmVLmL/On4C8SQkHm39ZYn+v2mSr7qM9f5qbgLxJCwcRurQ31ed0+Xp+/FjSX+dC7RCSEcpl/Q11Bt09jfZRIxKcp4cAAAAs7SURBVFZy1yQkFPxFQijI/IvLPqr3y3wp+IuEUDLjXeHbGq8ruMhLnT4yXwr+IiGUTGeJRSM0xKJT0ztoFS9ZAAV/kRBKZbLUR42Gumiu1VNlH1kIBX+REEqms8TqIjTURwoGfFX2kflS8BcJIS/zj9BQHyWVcWSyjgll/rIACv4iIZSf+YM3s+d4Mq0ZPWXeFPxFQiiR8Qd8/TLPZCrDZCpLY72u7pX5UfAXCaFUkPnXecF/ws/8VfaR+VLwFwmhpF/zj+fKPlnGkxmVfWTeFPxFQiiVCWr+fuafzJBIZ7WQi8ybgr9ICOUu8vKD/fmJJKDpnGX+FPxFQiiZcdTXRWjwZ/A8N+YFf5V9ZL4U/EVCKMj8g2B/fjwFaP1emT8Ff5EQ8mr+liv7KPOXhVLwFwmhXM3fb/UcGlfNXxZGwV8khKamd/B+hYdyZR9d5CXzsyTB38x+x8ycmXXlbbvDzA6a2QEzuylv+9Vm9qR/34fNTMsOiSxQML1D3C/7DKnsIwtUdvA3s03AG4Djedt2AbcClwN7gLvNLHhXfhS4Ddjhf+0pdx9Eak2yKPMPav4q+8h8LUXm/1fA+wCXt+1m4F7nXMI5dwQ4CFxjZuuBNufcQ845B3wGuGUJ9kGkpiTTWeJ1EWLRCGZw3q/5q9tH5qus4G9mbwZOOed+UnTXRuBE3u2T/raN/vfF22d6/tvMbK+Z7R0YGChnV0WqSlDzN/MWdDk3rrKPLMyco0Nm9i1gXYm77gT+AHhjqR8rsc3Nsr0k59w9wD0AfX19Mz5OpJakM1myDmL+BV4N9ZHcgK/KPjJfcwZ/59yNpbab2UuBrcBP/DHbHuBxM7sGL6Pf
lPfwHuC0v72nxHYRmaeUv3h7fTQI/lHAC/5B66fIXBZd9nHOPemcW+Oc63XO9eIF9t3OuTPAA8CtZhY3s614A7uPOudeAEbM7Dq/y+ftwP3lH4ZI7UimvQXbpzL/qP9vhEhEzXMyPxVpCnbOPW1m9wHPAGng3c65jH/37cCngEbgQf9LROYpmfGDf9QL9HH/j0BTTD3+Mn9L9m7xs//823cBd5V43F7giqV6XZFakwv+ftAPBnnV6SMLoSt8RUImVVz28ev86vSRhVDwFwmZIPOfGvANyj4K/jJ/Cv4iIZMb8I0WD/gq+Mv8KfiLhEwu8y/q9lHmLwuh4C8SMkHmH1fZR8qg4C8SMqmizD9ep7KPLJyCv0jIzFTzV+YvC6HgLxIyqRm6fdTnLwuh4C8SMokZpndo1BW+sgAK/iIhE0zsliv71GnAVxZOwV8kZGaa2E1lH1kIBX+RkJmq+XsTu+Xm9lHmLwug4C8SMsWZf9DqqbKPLISCv0jIzDS3j8o+shAK/iIhU9znv6a1AYC1bQ0rtk8SPuoNEwkZb/F2y63atWtDG9/73dexubNphfdMwkSZv0jIJNPZXMknoMAvC6XgLxIyqUw2N9grslh6B4mETDIzPfMXWSi9g0RCJpl2ucFekcXSO0gkZJIq+8gS0DtIJGRS6awyfymb3kEiIZPMZKmvs5XeDQk5BX+RkEkq85cloHeQSMio20eWgt5BIiGTTGvAV8qnd5BIyKQyKvtI+fQOEgkZZf6yFPQOEgmZlGr+sgT0DhIJGWX+shT0DhIJmWTGKfhL2fQOEgmZZDqjAV8pm95BIiGTUuYvS0DvIJGQSforeYmUQ8FfJEQyWUcm64hFtVi7lEfBXyREUhlv8XZN7CblUvAXCZGkH/w14Cvl0jtIJESSaT/4a8BXyqR3kEiIpJT5yxIp6x1kZn9sZqfMbJ//9aa8++4ws4NmdsDMbsrbfrWZPenf92EzU/FSZJ6CzF/TO0i5luId9FfOuSv9r38BMLNdwK3A5cAe4G4zC9oTPgrcBuzwv/YswT6I1IRc5q+yj5SprkLPezNwr3MuARwxs4PANWZ2FGhzzj0EYGafAW4BHqzQfvDrn/4Rx86OV+rpRZZVQpm/LJGlCP7vMbO3A3uB/+acGwI2Ag/nPeakvy3lf1+8vSQzuw3vUwKbN29e1M5t7mhWliRVpa+3nVf0tq/0bkjIzRn8zexbwLoSd92JV8L5U8D5//4l8E6gVB3fzbK9JOfcPcA9AH19fTM+bjbv//ldi/kxEZGqNmfwd87dOJ8nMrP/A3zVv3kS2JR3dw9w2t/eU2K7iIgso3K7fdbn3fwF4Cn/+weAW80sbmZb8QZ2H3XOvQCMmNl1fpfP24H7y9kHERFZuHJr/n9uZlfilW6OAr8B4Jx72szuA54B0sC7nXMZ/2duBz4FNOIN9FZssFdEREoz5xZVSl92fX19bu/evSu9GyIioWJmjznn+oq3qw1GRKQGKfiLiNQgBX8RkRqk4C8iUoNCM+BrZgPAsUX+eBcwuIS7sxKq4RigOo6jGo4BquM4quEYoLLHscU51128MTTBvxxmtrfUaHeYVMMxQHUcRzUcA1THcVTDMcDKHIfKPiIiNUjBX0SkBtVK8L9npXdgCVTDMUB1HEc1HANUx3FUwzHAChxHTdT8RUSkUK1k/iIikkfBX0SkBlV18DezPf4C8gfN7PdXen/my8w2mdl3zOxZM3vazH7L395hZt80s+f9fy/65ZzMLGpmPzazr/q3w3gMq83sH8xsv/9/8sqwHYeZvdd/Lz1lZl8ws4YwHIOZfdLM+s3sqbxtM+63md3h/74fMLObVmavC81wDB/y309PmNlXzGx13n3LcgxVG/z9BeP/FvgZYBfwn/yF5cMgjbck5mXAdcC7/X3/feDbzrkdwLf92xe73wKezbsdxmP4G+BrzrlLgZfjHU9ojsPMNgL/Behzzl0BRIFbCccxfArYU7St5H77vyO3
Apf7P3O3HwdW2qeYfgzfBK5wzr0MeA64A5b3GKo2+APXAAedc4edc0ngXryF5S96zrkXnHOP+9+P4AWbjXj7/2n/YZ8GblmZPZwfM+sBfhb4eN7msB1DG/Ba4BMAzrmkc+48ITsOvLU7Gs2sDmjCW0Hvoj8G59z3gHNFm2fa75uBe51zCefcEeAgXhxYUaWOwTn3Dedc2r/5MFMrHC7bMVRz8N8InMi7Peti8RcrM+sFrgIeAdb6q6Hh/7tm5fZsXv4aeB+QzdsWtmPYBgwAf++Xrz5uZs2E6Dicc6eAvwCOAy8AF5xz3yBEx1Bkpv0O6+/8O5la1GrZjqGag/+CFou/GJlZC/Al4Ledc8MrvT8LYWY/B/Q75x5b6X0pUx2wG/ioc+4qYIyLszwyI78mfjOwFdgANJvZ21Z2ryoidL/zZnYnXpn388GmEg+ryDFUc/CfaRH5UDCzerzA/3nn3Jf9zS8G6yb7//av1P7Nw/XAm83sKF7J7afN7HOE6xjAex+ddM494t/+B7w/BmE6jhuBI865AedcCvgy8CrCdQz5ZtrvUP3Om9k7gJ8DftlNXXC1bMdQzcH/R8AOM9tqZjG8QZQHVnif5sVf3P4TwLPOuf+Vd9cDwDv8798B3L/c+zZfzrk7nHM9zrlevHP/r865txGiYwBwzp0BTpjZTn/T6/HWpg7TcRwHrjOzJv+99Xq8caQwHUO+mfb7AeBWM4ub2VZgB/DoCuzfnMxsD/B7wJudc+N5dy3fMTjnqvYLeBPeSPoh4M6V3p8F7Per8T7qPQHs87/eBHTidTc87//bsdL7Os/juQH4qv996I4BuBLY6/9//CPQHrbjAP4E2A88BXwWiIfhGIAv4I1TpPCy4l+bbb+BO/3f9wPAz6z0/s9yDAfxavvB7/fHlvsYNL2DiEgNquayj4iIzEDBX0SkBin4i4jUIAV/EZEapOAvIlKDFPxFRGqQgr+ISA36/+q6jxCvItE5AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):\n",
    "    \"\"\"Play one episode and return its total reward and step count.\n",
    "\n",
    "    Args:\n",
    "        env: gym environment using the old step API (obs, reward, done, info).\n",
    "        agent: object exposing reset(mode=...), step(observation, reward, done)\n",
    "            -> action, and close(); defined in an earlier cell.\n",
    "        max_episode_steps: optional step cap; None/0 means no cap (the env's\n",
    "            own termination is relied on instead).\n",
    "        mode: forwarded to agent.reset(); 'train' is used for learning runs,\n",
    "            None for evaluation runs (exact semantics live in the agent class).\n",
    "        render: if True, render the environment every step.\n",
    "\n",
    "    Returns:\n",
    "        (episode_reward, elapsed_steps) tuple.\n",
    "    \"\"\"\n",
    "    observation, reward, done = env.reset(), 0., False\n",
    "    agent.reset(mode=mode)\n",
    "    episode_reward, elapsed_steps = 0., 0\n",
    "    while True:\n",
    "        # The agent receives the reward/done produced by its previous action\n",
    "        # together with the current observation, and returns the next action.\n",
    "        action = agent.step(observation, reward, done)\n",
    "        if render:\n",
    "            env.render()\n",
    "        if done:\n",
    "            break\n",
    "        observation, reward, done, _ = env.step(action)\n",
    "        episode_reward += reward\n",
    "        elapsed_steps += 1\n",
    "        if max_episode_steps and elapsed_steps >= max_episode_steps:\n",
    "            break\n",
    "    agent.close()\n",
    "    return episode_reward, elapsed_steps\n",
    "\n",
    "\n",
    "logging.info('==== train & verify ====')\n",
    "episode_rewards = []\n",
    "for episode in itertools.count():\n",
    "    # Train on the unwrapped env (bypassing the TimeLimit wrapper) with an\n",
    "    # explicit step cap, then evaluate on the wrapped env.\n",
    "    play_episode(env.unwrapped, agent,\n",
    "            max_episode_steps=env._max_episode_steps, mode='train')\n",
    "    episode_reward, elapsed_steps = play_episode(env, agent)\n",
    "    episode_rewards.append(episode_reward)\n",
    "    logging.debug('verify episode %d: reward = %.2f, steps = %d',\n",
    "            episode, episode_reward, elapsed_steps)\n",
    "    # Stop once the last 10 evaluation episodes average better than -120.\n",
    "    if np.mean(episode_rewards[-10:]) > -120:\n",
    "        break\n",
    "plt.plot(episode_rewards)\n",
    "\n",
    "\n",
    "logging.info('==== test ====')\n",
    "episode_rewards = []\n",
    "for episode in range(100):\n",
    "    episode_reward, elapsed_steps = play_episode(env, agent)\n",
    "    episode_rewards.append(episode_reward)\n",
    "    logging.debug('test episode %d: reward = %.2f, steps = %d',\n",
    "            episode, episode_reward, elapsed_steps)\n",
    "logging.info('average episode reward = %.2f ± %.2f',\n",
    "        np.mean(episode_rewards), np.std(episode_rewards))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Close the environment and release its resources (e.g. render windows).\n",
    "env.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
